package com.spiderman.main;

import java.util.ArrayList;
import java.util.List;

import com.spiderman.entity.TianMaoListItem;
import com.spiderman.utils.ConnectionUtil;
import com.spiderman.utils.RegExUtil;

/**
 * Description: Tmall product-list crawler.
 *
 * <p>Requests one hard-coded Tmall shop search URL through
 * {@code ConnectionUtil.connectByJsoup}, hands the result together with
 * {@link #REGEX_TIANMAO} to {@code RegExUtil.getTianMaoListItem}, and prints
 * every returned {@code TianMaoListItem} to standard output.
 *
 * @author Kwum
 * @date 2017-08-05 15:10:27
 * @version 1.0
 */

public class GetTianMaoList {

    /**
     * Shop search endpoint that is crawled.
     * NOTE(review): the query string carries what look like request-specific
     * values ({@code _ksTS}, {@code callback}, {@code spm}); confirm whether
     * they need to be refreshed for the request to keep working.
     */
    private static final String LIST_URL = "https://jinfuli.tmall.com/i/asynSearch.htm?_ksTS=1501916570616_191&callback=jsonp192&mid=w-16356460438-0&wid=16356460438&path=/search.htm&search=y&spm=a1z10.1-b-s.w5001-16356460381.3.72faef38ivJNaj&scene=taobao_shop";

    /**
     * Custom regular expression for one {@code <dl class="item">} entry.
     * How to build it: copy the wanted fragment out of the HTML, replace each
     * variable part with {@code (.*?)}, and escape every {@code "} as {@code \"}.
     *
     * <p>Capture groups, in order of appearance in the pattern:
     * <ol>
     *   <li>value of {@code data-id} on the {@code dl} element</li>
     *   <li>attributes of the anchor inside {@code dt.photo}</li>
     *   <li>value inside the {@code item.discntPrice} HTML comment</li>
     *   <li>attributes of the anchor inside {@code dd.detail}</li>
     *   <li>text of that anchor</li>
     *   <li>content of {@code span.c-price}</li>
     *   <li>content of {@code span.sale-num}</li>
     *   <li>text between {@code <h4>} and the {@code ><span>} that follows it</li>
     *   <li>text after the rating label inside that span</li>
     * </ol>
     *
     * <p>NOTE(review): tag names and attributes are written without separating
     * spaces (e.g. {@code <dlclass=}), so the input is presumably
     * whitespace-stripped before matching; confirm in ConnectionUtil/RegExUtil.
     * NOTE(review): the first line contains a doubled quote after
     * {@code item}; verify this matches the actual input.
     */
    private static final String REGEX_TIANMAO = "<dlclass=\"item\"\"data-id=\"(.*?)\">"
            + "\n<dtclass=\"photo\">"
            + "\n<aclass=(.*?)></a>"
            + "\n</dt>"
            + "\n<!--item.discntPrice:(.*?)-->"
            + "\n<ddclass=\"detail\">"
            + "\n<aclass=(.*?)>(.*?)</a>"
            + "\n<divclass=\"attribute\">"
            + "\n<divclass=\"cprice-area\">"
            + "\n<spanclass=\"symbol\">&yen;</span>"
            + "\n<spanclass=\"c-price\">(.*?)</span>"
            + "\n</div>"
            + "\n<divclass=\"sale-area\">"
            + "\n总销量："
            + "\n<spanclass=\"sale-num\">(.*?)</span>"
            + "\n</div>"
            + "\n</div>"
            + "\n</dd>"
            + "\n<ddclass=\"rates\">"
            + "\n<divclass=\"title\">"
            + "\n<h4>(.*?)><span>评价:(.*?)</span></a></h4>"
            + "\n</div>"
            + "\n</dd>"
            + "\n</dl>";

    /**
     * Fetches the shop listing, extracts the items and prints one per line.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Assign the parsed result directly; no placeholder list is needed.
        List<TianMaoListItem> items = RegExUtil.getTianMaoListItem(
                ConnectionUtil.connectByJsoup(LIST_URL), REGEX_TIANMAO);
        for (TianMaoListItem item : items) {
            System.out.println(item);
        }
    }
}
